Simple regression using Scikit-Learn

I had to do a regression: the perfect opportunity to use Scikit-learn. This notebook was largely inspired by this more complete example.

Importing libraries...


In [ ]:
import re
from sklearn import datasets, linear_model
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Formatting and cleaning data :

The data comes from tab-separated values in an HTML page; the line endings are just `<br>` line-break tags.


In [17]:
# Raw measurements copied from the HTML page: records are separated by "<br>",
# and each record is "<N> <time>" with arbitrary surrounding whitespace.
a = "32     0.000<br>       64     0.002<br>      128     0.016<br>      256     0.212<br>      512     2.102<br>     1024    22.744<br>     2048   248.011<br>     4096  2670.298"
tab_N = []
tab_time = []
# Raw string so that "\s" and "\d" reach the regex engine untouched, and the
# decimal point is escaped so it only matches a literal "." (a bare "." would
# match any character).
p = re.compile(r'\s*(\d+)\s+(\d+\.\d+)')
for i in a.split("<br>"):
    m = p.match(i)
    if m is None:
        # Report the offending record instead of failing on `None.group`.
        raise ValueError("Unparsable record: %r" % i)
    tab_N.append(int(m.group(1)))
    tab_time.append(float(m.group(2)))
# The first record (N=32) has a time of 0.000, whose log2 would be -inf;
# presumably that is why it is left out of the series.
s_N = pd.Series(tab_N[1:])
s_time = pd.Series(tab_time[1:])
plt.scatter(s_N, s_time)
# The time grows very quickly with N. Taking log2 of both axes turns a power
# law (time ~ N**k) into a straight line of slope k, which a linear fit can find.
s_log_N = np.log2(s_N)
s_log_time = np.log2(s_time)



In [36]:
# Gather the log-log data in one frame; double-bracket selection below keeps
# X and y two-dimensional, as scikit-learn estimators expect.
df = pd.DataFrame({'N': s_log_N, 'time': s_log_time})
# Fixed seed so the train/test split (and therefore the fitted slope and the
# highlighted test points) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[["N"]], df[["time"]], random_state=42
)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
# `coef_` is an array, not a scalar: extract its single element explicitly
# rather than relying on implicit array-to-float conversion when formatting.
print("The slope is : %0.2f" % np.ravel(regr.coef_)[0])
predicted_all = regr.predict(df[["N"]])
predicted_train = regr.predict(X_train)
predicted_test = regr.predict(X_test)

# Plot the results and compare them with the test sample.
plt.scatter(s_log_N, s_log_time, color='yellow', alpha=0.6)
plt.plot(df[["N"]], predicted_all, c='g', alpha=0.5)
plt.xlabel("log2(N)")
plt.ylabel("log2(time)")
# The test sample (the points in red) can be compared visually with the
# fitted line.
plt.scatter(X_test, y_test, s=60, c='r', alpha=0.9)


The slope is : 3.39
Out[36]:
<matplotlib.collections.PathCollection at 0x1138b4690>

In [45]:
# Relative error (in percent) of the predictions, computed on flat NumPy
# arrays. Mixing the ndarray predictions with the DataFrame targets can yield
# a DataFrame, and iterating a DataFrame walks its column labels rather than
# its rows, which breaks the formatting below.
# NOTE(review): these are relative errors of the log2 values, not of the raw
# times, and they blow up when a log2(time) is close to 0 - confirm this is
# the intended metric.
y_test_values = np.asarray(y_test).ravel()
y_train_values = np.asarray(y_train).ravel()
predicted_test_values = np.asarray(predicted_test).ravel()
predicted_train_values = np.asarray(predicted_train).ravel()

error_test_pct = (predicted_test_values - y_test_values) / y_test_values * 100
error_train_pct = (predicted_train_values - y_train_values) / y_train_values * 100

plt.scatter(predicted_test_values, error_test_pct, c='g', s=40)
plt.scatter(predicted_train_values, error_train_pct, c='b', s=40, alpha=0.5)
print("The test sample error is " + ", ".join(["%0.2f%%" % e for e in error_test_pct]))


The test sample error is 1.06%, -5.75%